# Run this script in a fresh R session. The workspace is deliberately not
# wiped here: rm(list = ls()) would delete the caller's objects whenever the
# script is source()d, and it does not reset loaded packages or options.

# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful error.
library(quanteda)
library(stringi)

# Project-local helpers; the code below calls tokenize_japanese() and
# fix_japanese() and reads stopwords_ja, none of which are defined in this file
# (presumably they come from functions.R -- confirm).
source("functions.R")

# training corpus
# Read the stored corpus and keep only documents that are not flagged as
# duplicates (dupli), whose noise value is below the corpus-wide 90th
# percentile, and whose n_sent value is greater than one.
corp <- readRDS("data_corpus_ja.RDS")
corp <- corpus_subset(
    corp,
    !dupli & noise < quantile(noise, 0.9, na.rm = TRUE) & n_sent > 1
)

# Tokenize, apply the project's fix_japanese() clean-up, drop stopwords, and
# store the resulting tokens object on disk.
toks <- tokenize_japanese(corp)
toks <- fix_japanese(toks)
toks <- tokens_remove(toks, stopwords_ja)
saveRDS(toks, "data_tokens_ja.RDS")

# test corpus
# NOTE(review): seqs_kanji, seqs_kana and seqs_any are not created in this
# file; presumably they are restored by this load() call -- confirm.
load("collocations_ja.Rdata")
corp_mnu <- readRDS("data_corpus_manual_ja.RDS")
toks_mnu <- tokens(corp_mnu, remove_url = TRUE)

# Pipeline applied to the cleaned tokens:
#   1. keep only tokens consisting entirely of full-width digits, hiragana,
#      katakana, the long-vowel mark or kanji (padding = TRUE is passed so
#      that removed tokens leave a placeholder rather than closing the gap);
#   2. remove stopwords, again with padding;
#   3. compound the kanji, kana and mixed sequences whose z value exceeds 3,
#      in that order, joining the parts without a separator.
toks_mnu <- fix_japanese(toks_mnu) %>%
    tokens_select("^[０-９ぁ-んァ-ヶー一-龠]+$", valuetype = "regex",
                  padding = TRUE) %>%
    tokens_remove(stopwords_ja, padding = TRUE) %>%
    tokens_compound(seqs_kanji[seqs_kanji$z > 3],
                    concatenator = "", join = TRUE) %>%
    tokens_compound(seqs_kana[seqs_kana$z > 3],
                    concatenator = "", join = TRUE) %>%
    tokens_compound(seqs_any[seqs_any$z > 3],
                    concatenator = "", join = TRUE)

saveRDS(toks_mnu, "data_tokens_manual_ja.RDS")
